In [58]:
!pip install numpy pandas matplotlib seaborn
!pip install scikit-learn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import numpy as np
from scipy.stats import zscore
from sklearn import linear_model
from sklearn.model_selection import train_test_split
Requirement already satisfied: numpy in c:\users\austin\anaconda3\lib\site-packages (1.26.4) Requirement already satisfied: pandas in c:\users\austin\anaconda3\lib\site-packages (2.2.2) Requirement already satisfied: matplotlib in c:\users\austin\anaconda3\lib\site-packages (3.8.4) Requirement already satisfied: seaborn in c:\users\austin\anaconda3\lib\site-packages (0.13.2) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\austin\anaconda3\lib\site-packages (from pandas) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in c:\users\austin\anaconda3\lib\site-packages (from pandas) (2024.1) Requirement already satisfied: tzdata>=2022.7 in c:\users\austin\anaconda3\lib\site-packages (from pandas) (2023.3) Requirement already satisfied: contourpy>=1.0.1 in c:\users\austin\anaconda3\lib\site-packages (from matplotlib) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\austin\anaconda3\lib\site-packages (from matplotlib) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\austin\anaconda3\lib\site-packages (from matplotlib) (4.51.0) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\austin\anaconda3\lib\site-packages (from matplotlib) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\austin\anaconda3\lib\site-packages (from matplotlib) (23.2) Requirement already satisfied: pillow>=8 in c:\users\austin\anaconda3\lib\site-packages (from matplotlib) (10.3.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\austin\anaconda3\lib\site-packages (from matplotlib) (3.0.9) Requirement already satisfied: six>=1.5 in c:\users\austin\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0) Requirement already satisfied: scikit-learn in c:\users\austin\anaconda3\lib\site-packages (1.4.2) Requirement already satisfied: numpy>=1.19.5 in c:\users\austin\anaconda3\lib\site-packages (from scikit-learn) (1.26.4) Requirement already satisfied: scipy>=1.6.0 in 
c:\users\austin\anaconda3\lib\site-packages (from scikit-learn) (1.13.1) Requirement already satisfied: joblib>=1.2.0 in c:\users\austin\anaconda3\lib\site-packages (from scikit-learn) (1.4.2) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\austin\anaconda3\lib\site-packages (from scikit-learn) (2.2.0)
In [59]:
# Introduction
# Our dataset comes from Kaggle and can be found here: https://www.kaggle.com/datasets/sudhanvahg/wind-turbine-power-generation-forecasting
# The dataset contains hourly meteorological readings along with power output measurements from wind turbines, collected between January 2013
# and December 2016. The data is split into two portions, a train set and a test set. The included variables are temperature, relative humidity
# and dew point at 2m, wind speed and wind direction at 10m and 100m, and wind gusts at 10m. There are 140160 entries
# in our train dataset, and 35040 entries in our test dataset. There are no null values.
# Let's also discuss our goal for the project. The aim is to create a model that predicts the power generated by the turbines
# as accurately as possible, reaching an R-squared value of at least 0.7 when comparing test values to predicted values.
In [60]:
# What is regression?
# Linear regression is a method used to model the relationship between input features and a numeric target, using known values to predict unknown
# target values. It fits a line (a hyperplane when there are several features) to the data by choosing the coefficients that minimize the sum of
# squared differences between the predicted and the actual target values. One way to measure how good a model is is its R-squared value:
# R^2 = 1 - SS_res / SS_tot, i.e. one minus the residual sum of squares divided by the total sum of squares, so values closer to 1 mean a better fit.
In [61]:
# Read the raw train and test CSVs from the working directory, untouched, so later cells can derive cleaned frames from them.
wt_train_df_raw, wt_test_df_raw = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
In [62]:
# Summary statistics of the raw training set; describe() only reports the numeric columns, so the Time strings are left out.
wt_train_df_raw.describe()
Out[62]:
| Unnamed: 0 | Location | Temp_2m | RelHum_2m | DP_2m | WS_10m | WS_100m | WD_10m | WD_100m | WG_10m | Power | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 140160.000000 | 140160.000000 | 140160.000000 | 140160.000000 | 140160.000000 | 140160.000000 | 140160.000000 | 140160.000000 | 140160.000000 | 140160.000000 | 140160.000000 |
| mean | 17519.500000 | 2.500000 | 45.912162 | 70.420512 | 35.867228 | 4.352948 | 6.924278 | 200.684417 | 200.649010 | 8.027673 | 0.312437 |
| std | 10115.212797 | 1.118038 | 21.930554 | 17.000203 | 20.979720 | 2.027149 | 3.056636 | 100.079917 | 101.105919 | 3.626641 | 0.253774 |
| min | 0.000000 | 1.000000 | -31.420400 | 8.664205 | -36.627405 | 0.165389 | 0.007799 | 0.051683 | -0.942685 | 0.436515 | -0.000004 |
| 25% | 8759.750000 | 1.750000 | 29.579600 | 57.664205 | 21.172595 | 2.815389 | 4.747799 | 128.051683 | 127.057315 | 5.236515 | 0.099696 |
| 50% | 17519.500000 | 2.500000 | 45.879600 | 72.664205 | 35.672595 | 4.055389 | 6.717799 | 211.051683 | 211.057315 | 7.636515 | 0.246896 |
| 75% | 26279.250000 | 3.250000 | 64.579600 | 84.664205 | 53.872595 | 5.575389 | 8.847799 | 287.051683 | 289.057315 | 10.236515 | 0.486396 |
| max | 35039.000000 | 4.000000 | 94.479600 | 99.664205 | 78.272595 | 18.695389 | 24.597799 | 359.051683 | 359.057315 | 28.936515 | 0.988796 |
In [63]:
# Same summary for the raw test set, to compare its feature ranges against the training set.
wt_test_df_raw.describe()
Out[63]:
| Unnamed: 0 | Location | Temp_2m | RelHum_2m | DP_2m | WS_10m | WS_100m | WD_10m | WD_100m | WG_10m | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 35040.000000 | 35040.00000 | 35040.000000 | 35040.000000 | 35040.000000 | 35040.000000 | 35040.000000 | 35040.000000 | 35040.000000 | 35040.000000 |
| mean | 39419.500000 | 2.50000 | 48.529326 | 67.415432 | 36.975266 | 4.203348 | 6.738558 | 200.467978 | 200.385882 | 7.762908 |
| std | 2528.830248 | 1.11805 | 22.215776 | 18.380048 | 20.722451 | 1.949184 | 2.988170 | 97.856886 | 98.978351 | 3.572855 |
| min | 35040.000000 | 1.00000 | -26.920400 | 15.664205 | -32.727405 | 0.165389 | 0.107799 | 0.051683 | -0.942685 | 0.436515 |
| 25% | 37229.750000 | 1.75000 | 31.879600 | 53.664205 | 22.672595 | 2.725389 | 4.587799 | 133.051683 | 132.057315 | 5.036515 |
| 50% | 39419.500000 | 2.50000 | 49.679600 | 68.664205 | 36.372595 | 3.885389 | 6.557799 | 205.051683 | 205.057315 | 7.336515 |
| 75% | 41609.250000 | 3.25000 | 66.879600 | 82.664205 | 55.072595 | 5.405389 | 8.637799 | 284.051683 | 286.057315 | 10.036515 |
| max | 43799.000000 | 4.00000 | 101.479600 | 99.664205 | 75.272595 | 15.535389 | 22.847799 | 359.051683 | 359.057315 | 29.136515 |
In [64]:
# Count missing values per column in the raw training set.
wt_train_df_raw.isna().sum()
Out[64]:
Unnamed: 0 0 Time 0 Location 0 Temp_2m 0 RelHum_2m 0 DP_2m 0 WS_10m 0 WS_100m 0 WD_10m 0 WD_100m 0 WG_10m 0 Power 0 dtype: int64
In [65]:
# Count missing values per column in the raw test set.
wt_test_df_raw.isna().sum()
Out[65]:
Unnamed: 0 0 Time 0 Location 0 Temp_2m 0 RelHum_2m 0 DP_2m 0 WS_10m 0 WS_100m 0 WD_10m 0 WD_100m 0 WG_10m 0 dtype: int64
In [66]:
# Peek at the first rows of the raw training set to see the column layout, including the Time strings.
wt_train_df_raw.head()
Out[66]:
| Unnamed: 0 | Time | Location | Temp_2m | RelHum_2m | DP_2m | WS_10m | WS_100m | WD_10m | WD_100m | WG_10m | Power | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 02-01-2013 00:00 | 1 | 28.2796 | 84.664205 | 24.072595 | 1.605389 | 1.267799 | 145.051683 | 161.057315 | 1.336515 | 0.163496 |
| 1 | 1 | 02-01-2013 01:00 | 1 | 28.1796 | 85.664205 | 24.272595 | 2.225389 | 3.997799 | 150.051683 | 157.057315 | 4.336515 | 0.142396 |
| 2 | 2 | 02-01-2013 02:00 | 1 | 26.5796 | 90.664205 | 24.072595 | 1.465389 | 2.787799 | 147.051683 | 149.057315 | 3.136515 | 0.121396 |
| 3 | 3 | 02-01-2013 03:00 | 1 | 27.1796 | 87.664205 | 23.872595 | 1.465389 | 2.697799 | 57.051683 | 104.057315 | 1.536515 | 0.100296 |
| 4 | 4 | 02-01-2013 04:00 | 1 | 27.0796 | 87.664205 | 23.672595 | 2.635389 | 4.437799 | 57.051683 | 83.057315 | 3.936515 | 0.079296 |
In [67]:
# Preprocessing for our Models
# The first thing we can drop is the "Unnamed: 0" column; it appears to just be an index column for our data, which we won't need.
# We'll also be dropping the location of each entry, since we don't have any information on where exactly these locations are.
# We'll be dropping the time data as well for now. We'll likely revisit this later by potentially pulling out the time of day as well as
# splitting our entries into the 4 seasons.
# In addition to this, there may be some outliers in our dataset. Since we have such a large dataset, we can afford to remove them
# so that they do not skew our results. We can look at a pairplot of our data to see whether there are
# potentially any outliers.
In [68]:
# Build the working test frame: remove the leftover index column, the location id and the raw timestamp in one pass.
wt_test_df = wt_test_df_raw.drop(columns=['Unnamed: 0', 'Location', 'Time'])
wt_test_df.head()
Out[68]:
| Temp_2m | RelHum_2m | DP_2m | WS_10m | WS_100m | WD_10m | WD_100m | WG_10m | |
|---|---|---|---|---|---|---|---|---|
| 0 | 19.6796 | 89.664205 | 17.072595 | 1.085389 | 0.867799 | 192.051683 | 233.057315 | 1.636515 |
| 1 | 19.1796 | 90.664205 | 16.672595 | 1.665389 | 1.907799 | 183.051683 | 197.057315 | 2.736515 |
| 2 | 19.5796 | 88.664205 | 16.672595 | 1.845389 | 2.867799 | 162.051683 | 167.057315 | 2.736515 |
| 3 | 20.3796 | 85.664205 | 16.572595 | 2.325389 | 3.987799 | 166.051683 | 161.057315 | 2.936515 |
| 4 | 21.0796 | 82.664205 | 16.572595 | 2.265389 | 4.047799 | 182.051683 | 170.057315 | 3.136515 |
In [69]:
# Build the working training frame: remove the leftover index column, the location id and the raw timestamp in one pass.
wt_train_df = wt_train_df_raw.drop(columns=['Unnamed: 0', 'Location', 'Time'])
wt_train_df.head()
Out[69]:
| Temp_2m | RelHum_2m | DP_2m | WS_10m | WS_100m | WD_10m | WD_100m | WG_10m | Power | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 28.2796 | 84.664205 | 24.072595 | 1.605389 | 1.267799 | 145.051683 | 161.057315 | 1.336515 | 0.163496 |
| 1 | 28.1796 | 85.664205 | 24.272595 | 2.225389 | 3.997799 | 150.051683 | 157.057315 | 4.336515 | 0.142396 |
| 2 | 26.5796 | 90.664205 | 24.072595 | 1.465389 | 2.787799 | 147.051683 | 149.057315 | 3.136515 | 0.121396 |
| 3 | 27.1796 | 87.664205 | 23.872595 | 1.465389 | 2.697799 | 57.051683 | 104.057315 | 1.536515 | 0.100296 |
| 4 | 27.0796 | 87.664205 | 23.672595 | 2.635389 | 4.437799 | 57.051683 | 83.057315 | 3.936515 | 0.079296 |
In [70]:
# Data Understanding/Visualization
# Plot every column of the training frame against every other column to eyeball relationships and outliers.
# NOTE(review): this draws all rows of wt_train_df; if the cell is too slow, consider plotting a random sample instead.
sns.pairplot(wt_train_df)
Out[70]:
<seaborn.axisgrid.PairGrid at 0x2d422de91f0>
In [71]:
# Based on the above pairplot, it does look like we have a few outliers here and there, especially in our wind speed graphs, so let's figure out
# how to deal with outliers. We can compute the z-score of each value, which describes the value's position relative to
# the column's mean (average). It is measured in standard deviations from the mean, so a z-score that is large in absolute value suggests an outlier.
# We'll use this to remove rows with values more than 3 standard deviations from the mean. For roughly normally distributed data, 3 standard deviations
# covers about 99.7% of the values, so this is a reasonable way for us to omit our outliers.
In [72]:
# Outlier filter: compute a per-column z-score for every listed column, take its absolute value, and keep
# only the rows where every column lies strictly within 3 standard deviations of that column's mean.
columns_to_score = [
    'Temp_2m', 'RelHum_2m', 'DP_2m',
    'WS_10m', 'WS_100m',
    'WD_10m', 'WD_100m',
    'WG_10m', 'Power',
]
z_scores = wt_train_df[columns_to_score].apply(zscore).abs()
# A row survives only if all of its scored columns pass the threshold.
filtered_rows = z_scores.lt(3).all(axis=1)
wt_train_df_flt = wt_train_df.loc[filtered_rows]
In [73]:
# Re-check the summary statistics after filtering to see how many rows remain and how the column extremes changed.
wt_train_df_flt.describe()
Out[73]:
| Temp_2m | RelHum_2m | DP_2m | WS_10m | WS_100m | WD_10m | WD_100m | WG_10m | Power | |
|---|---|---|---|---|---|---|---|---|---|
| count | 138519.000000 | 138519.000000 | 138519.000000 | 138519.000000 | 138519.000000 | 138519.000000 | 138519.000000 | 138519.000000 | 138519.000000 |
| mean | 46.117744 | 70.478512 | 36.092496 | 4.282069 | 6.828647 | 200.037186 | 199.993453 | 7.910036 | 0.309281 |
| std | 21.803271 | 16.955317 | 20.827574 | 1.903810 | 2.911459 | 100.063676 | 101.088483 | 3.446334 | 0.251757 |
| min | -19.820400 | 19.664205 | -26.927405 | 0.165389 | 0.007799 | 0.051683 | -0.942685 | 0.436515 | -0.000004 |
| 25% | 29.679600 | 57.664205 | 21.272595 | 2.805389 | 4.717799 | 127.051683 | 126.057315 | 5.236515 | 0.098696 |
| 50% | 46.279600 | 72.664205 | 35.872595 | 4.015389 | 6.677799 | 210.051683 | 210.057315 | 7.536515 | 0.244196 |
| 75% | 64.679600 | 84.664205 | 54.072595 | 5.515389 | 8.767799 | 287.051683 | 288.057315 | 10.236515 | 0.480096 |
| max | 94.479600 | 99.664205 | 78.272595 | 10.425389 | 16.067799 | 359.051683 | 359.057315 | 18.836515 | 0.985896 |
In [74]:
# With this, we've cut our dataframe down from 140160 entries to 138519 entries, meaning we've removed 1641 entries (about 1.2%) that were outliers
# from our data. Awesome!
# Now let's gauge the relationships between our variables. To do this, we'll be using a correlation matrix.
In [75]:
# Pairwise correlations between all columns of the filtered training frame, drawn as an annotated heatmap.
feature_corr = wt_train_df_flt.corr()
sns.heatmap(feature_corr, annot=True)
plt.show()
In [76]:
# The features most strongly correlated with our power output are the wind speeds at 10m and 100m above the ground,
# as well as the wind gusts at 10m above the ground.
In [77]:
# Model 1
In [81]:
# The provided test set has no Power column, so it cannot be used to evaluate a model.
# Instead, features and target are both taken from the filtered training frame and split further below.
# y is the target (Power); X is every remaining column.
y = wt_train_df_flt['Power']
X = wt_train_df_flt.drop(columns='Power')
In [90]:
# Split the filtered training data into a training portion and a 20% held-out portion.
# random_state is pinned so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)
# Report the shape of each piece as a quick sanity check on the split sizes.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name, split_data.shape)
X_train (110815, 8) X_test (27704, 8) y_train (110815,) y_test (27704,)
In [92]:
# Fit a linear regression on the training split; fit() returns the estimator itself, so it can be chained.
wt_linear_model = linear_model.LinearRegression().fit(X_train, y_train)
# Leave the fitted estimator as the last expression so the cell still displays it.
wt_linear_model
Out[92]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [93]:
# Let's see how well the model reproduces the data it was trained on.
# These predictions are for the training features, so they describe fit quality rather than performance on unseen data.
# NOTE(review): y_pred is not used by any later cell visible here — confirm whether it is still needed.
y_pred = wt_linear_model.predict(X_train)
In [94]:
# The R-squared value computed in the next cell (about 0.48 on the training data) isn't great, so a plain linear model may not be a good fit for our data.
In [95]:
# R^2 on the training split only shows how well the line fits data the model has already seen.
# The project goal is stated in terms of test values, so also report R^2 on the held-out split,
# which the model never saw during fitting.
wt_linear_r2_test = wt_linear_model.score(X_test, y_test)
print("held-out R^2:", wt_linear_r2_test)
# Training R^2 stays the last expression so the cell output is unchanged.
wt_linear_model.score(X_train, y_train)
Out[95]:
0.4754723450463729
In [ ]: